source('prepare_functions.R')
library(randomForest)
library(e1071)
library(caret)
library(ggplot2)
# Fix the RNG seed so the later random steps (the sample() split and the
# random forest fit) give the same results on every run.
set.seed(123)

# Import the data via the project's custom helper
# (presumably defined in prepare_functions.R, which is sourced at the top).
data <- prepareAndCleanData()
head(data)
## Edible CapShape CapSurface CapColor Bruises Odor GillAttachment
## 1 Poisonous Convex Smooth Brown True Pungent Free
## 2 Edible Convex Smooth Yellow True Almond Free
## 3 Edible Bell Smooth White True Anise Free
## 4 Poisonous Convex Scaly White True Pungent Free
## 5 Edible Convex Smooth Gray False None Free
## 6 Edible Convex Scaly Yellow True Almond Free
## GillSpacing GillSize GillColor StalkShape StalkRoot
## 1 Close Narrow Black Enlarging Equal
## 2 Close Broad Black Enlarging Club
## 3 Close Broad Brown Enlarging Club
## 4 Close Narrow Brown Enlarging Equal
## 5 Crowded Broad Black Tapering Equal
## 6 Close Broad Brown Enlarging Club
## StalkSurfaceAboveRing StalkSurfaceBelowRing StalkColorAboveRing
## 1 Smooth Smooth White
## 2 Smooth Smooth White
## 3 Smooth Smooth White
## 4 Smooth Smooth White
## 5 Smooth Smooth White
## 6 Smooth Smooth White
## StalkColorBelowRing VeilType VeilColor RingNumber RingType
## 1 White Partial White One Pendant
## 2 White Partial White One Pendant
## 3 White Partial White One Pendant
## 4 White Partial White One Pendant
## 5 White Partial White One Evanescent
## 6 White Partial White One Pendant
## SporePrintColor Population Habitat
## 1 Black Scattered Urban
## 2 Brown Numerous Grasses
## 3 Brown Numerous Meadows
## 4 Black Scattered Urban
## 5 Brown Abundnant Grasses
## 6 Black Numerous Grasses
# Per-column level counts for every column of the data set.
summary(data) # no NA counts are reported; StalkRoot has an explicit "Missing" level instead
## Edible CapShape CapSurface CapColor
## Edible :4208 Convex :3656 Scaly :3244 Brown :2284
## Poisonous:3916 Flat :3152 Smooth :2556 Gray :1840
## Knobbed: 828 Fibrous:2320 Red :1500
## Bell : 452 Grooves: 4 Yellow :1072
## Sunken : 32 f : 0 White :1040
## Conical: 4 g : 0 Buff : 168
## (Other): 0 (Other): 0 (Other): 220
## Bruises Odor GillAttachment GillSpacing
## f : 0 None :3528 a : 0 c : 0
## t : 0 Foul :2160 f : 0 w : 0
## True :3376 Fishy : 576 Attached : 210 Close :6812
## False:4748 Spicy : 576 Descending: 0 Crowded:1312
## Almond : 400 Free :7914 Distant: 0
## Anise : 400 Notched : 0
## (Other): 484
## GillSize GillColor StalkShape StalkRoot
## b : 0 Buff :1728 e : 0 Bulbous:3776
## n : 0 Pink :1492 t : 0 Missing:2480
## Broad :5612 White :1202 Enlarging:3516 Equal :1120
## Narrow:2512 Brown :1048 Tapering :4608 Club : 556
## Gray : 752 Rooted : 192
## Chocolate: 732 ? : 0
## (Other) :1170 (Other): 0
## StalkSurfaceAboveRing StalkSurfaceBelowRing StalkColorAboveRing
## Smooth :5176 Smooth :4936 White :4464
## Silky :2372 Silky :2304 Pink :1872
## Fibrous: 552 Fibrous: 600 Gray : 576
## Scaly : 24 Scaly : 284 Brown : 448
## f : 0 f : 0 Buff : 432
## k : 0 k : 0 Orange : 192
## (Other): 0 (Other): 0 (Other): 140
## StalkColorBelowRing VeilType VeilColor RingNumber
## White :4384 p : 0 White :7924 n : 0
## Pink :1872 Partial :8124 Brown : 96 o : 0
## Gray : 576 Universal: 0 Orange : 96 t : 0
## Brown : 512 Yellow : 8 None: 36
## Buff : 432 n : 0 One :7488
## Orange : 192 o : 0 Two : 600
## (Other): 156 (Other): 0
## RingType SporePrintColor Population Habitat
## Pendant :3968 White :2388 Several :4040 Woods :3148
## Evanescent:2776 Brown :1968 Solitary :1712 Grasses:2148
## Large :1296 Black :1872 Scattered:1248 Paths :1144
## Flaring : 48 Chocolate:1632 Numerous : 400 Leaves : 832
## None : 36 Green : 72 Abundnant: 384 Urban : 368
## e : 0 Buff : 48 Clustered: 340 Meadows: 292
## (Other) : 0 (Other) : 144 (Other) : 0 (Other): 192
# Exploratory plots: jittered scatter plots of one categorical column against
# another, coloured by class (Edible = dark green, Poisonous = red).
# Odor and SporePrintColor are the best predictors.

# Cap shape vs. cap surface
p <- ggplot(data, aes(x = CapShape, y = CapSurface, color = Edible))
p +
  geom_jitter(alpha = 0.3) +
  scale_color_manual(
    breaks = c("Edible", "Poisonous"),
    values = c("darkgreen", "red")
  )

# Stalk colour below the ring vs. above the ring
p <- ggplot(data, aes(x = StalkColorBelowRing, y = StalkColorAboveRing, color = Edible))
p +
  geom_jitter(alpha = 0.3) +
  scale_color_manual(
    breaks = c("Edible", "Poisonous"),
    values = c("darkgreen", "red")
  )

# Odor vs. spore print colour
p <- ggplot(data, aes(x = Odor, y = SporePrintColor, color = Edible))
p +
  geom_jitter(alpha = 0.3) +
  scale_color_manual(
    breaks = c("Edible", "Poisonous"),
    values = c("darkgreen", "red")
  )

# Class vs. odor
p <- ggplot(data, aes(x = Edible, y = Odor, color = Edible))
p +
  geom_jitter(alpha = 0.2) +
  scale_color_manual(
    breaks = c("Edible", "Poisonous"),
    values = c("darkgreen", "red")
  )

# Class vs. spore print colour
p <- ggplot(data, aes(x = Edible, y = SporePrintColor, color = Edible))
p +
  geom_jitter(alpha = 0.2) +
  scale_color_manual(
    breaks = c("Edible", "Poisonous"),
    values = c("darkgreen", "red")
  )
# Create data for training
# Draw a label (1 or 2) for every row, with replacement, using probabilities
# 0.05 / 0.95. Rows labelled 1 form the training (development) set and rows
# labelled 2 form the validation set.
# NOTE(review): only about 5% of the rows end up in the training set --
# confirm this is intentional and not a swapped 0.95 / 0.05.
sample.ind <- sample(2,
                     nrow(data),
                     # TRUE, not T: `T` is an ordinary variable that can be reassigned
                     replace = TRUE,
                     prob = c(0.05, 0.95))
data.dev <- data[sample.ind == 1, ]
data.val <- data[sample.ind == 2, ]
# Class balance (share of Edible vs. Poisonous rows) in the full data set
table(data$Edible) / nrow(data)
##
## Edible Poisonous
## 0.5179714 0.4820286
# Class balance in the training data
table(data.dev$Edible) / nrow(data.dev)
##
## Edible Poisonous
## 0.4962779 0.5037221
# Class balance in the testing (validation) data
table(data.val$Edible) / nrow(data.val)
##
## Edible Poisonous
## 0.5191037 0.4808963
# Fit Random Forest Model
# Classify Edible from every other column of the training set using 100 trees.
# (A stray Markdown code fence that preceded this section was removed; it is
# not valid R and stopped the script from parsing.)
rf <- randomForest(Edible ~ .,
                   ntree = 100,
                   data = data.dev)
# Plot the fitted forest, then print its summary (OOB error, confusion matrix).
plot(rf)
print(rf)
##
## Call:
## randomForest(formula = Edible ~ ., data = data.dev, ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 0.25%
## Confusion matrix:
## Edible Poisonous class.error
## Edible 199 1 0.005
## Poisonous 0 203 0.000
# Variable Importance
# Plot the ten most important predictors, sorted by importance.
varImpPlot(rf,
           sort = TRUE, # TRUE, not T: `T` is an ordinary variable that can be reassigned
           n.var = 10,
           main = "Top 10 - Variable Importance")

# Importance table: type = 2 yields the MeanDecreaseGini column.
var.imp <- data.frame(importance(rf, type = 2))
# Copy the row names (predictor names) into a regular column
var.imp$Variables <- row.names(var.imp)
# Print all predictors, most important first
print(var.imp[order(var.imp$MeanDecreaseGini, decreasing = TRUE), ])
## MeanDecreaseGini Variables
## Odor 70.3755199 Odor
## SporePrintColor 32.2750755 SporePrintColor
## GillSize 15.2209526 GillSize
## GillColor 13.5667347 GillColor
## RingType 9.6420624 RingType
## Population 8.6651533 Population
## Bruises 7.3112655 Bruises
## StalkSurfaceAboveRing 6.3856365 StalkSurfaceAboveRing
## StalkRoot 5.8295113 StalkRoot
## GillSpacing 4.9118658 GillSpacing
## Habitat 4.3682607 Habitat
## StalkColorBelowRing 4.2249433 StalkColorBelowRing
## StalkSurfaceBelowRing 3.9699018 StalkSurfaceBelowRing
## CapColor 3.5055557 CapColor
## StalkShape 2.8743338 StalkShape
## RingNumber 2.2094357 RingNumber
## StalkColorAboveRing 2.0121177 StalkColorAboveRing
## CapShape 1.0667922 CapShape
## CapSurface 0.9756901 CapSurface
## VeilColor 0.5906364 VeilColor
## GillAttachment 0.0000000 GillAttachment
## VeilType 0.0000000 VeilType
# Predict the response for the training rows and store it next to them.
# These are in-sample predictions (the forest was fitted on data.dev), so the
# resulting metrics are optimistic.
data.dev$predicted.response <- predict(rf, data.dev)

# Confusion matrix of predicted vs. actual class, with Edible as the
# positive class
print(
  confusionMatrix(
    data = data.dev$predicted.response,
    reference = data.dev$Edible,
    positive = "Edible"
  )
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Edible Poisonous
## Edible 200 0
## Poisonous 0 203
##
## Accuracy : 1
## 95% CI : (0.9909, 1)
## No Information Rate : 0.5037
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4963
## Detection Rate : 0.4963
## Detection Prevalence : 0.4963
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Edible
##
# Predict the response for the validation rows (not used to fit the forest)
# and store it next to them.
data.val$predicted.response <- predict(rf, data.val)

# Confusion matrix of predicted vs. actual class, with Edible as the
# positive class
print(
  confusionMatrix(
    data = data.val$predicted.response,
    reference = data.val$Edible,
    positive = "Edible"
  )
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Edible Poisonous
## Edible 3958 8
## Poisonous 50 3705
##
## Accuracy : 0.9925
## 95% CI : (0.9903, 0.9943)
## No Information Rate : 0.5191
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.985
## Mcnemar's Test P-Value : 7.303e-08
##
## Sensitivity : 0.9875
## Specificity : 0.9978
## Pos Pred Value : 0.9980
## Neg Pred Value : 0.9867
## Prevalence : 0.5191
## Detection Rate : 0.5126
## Detection Prevalence : 0.5137
## Balanced Accuracy : 0.9927
##
## 'Positive' Class : Edible
##